#Importing Important Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")
sns.set_style('whitegrid')
from IPython.display import display # Allows the use of display() for DataFrames
import visuals as vs
#Checking the Working Directory for the data inside
os.listdir()
# Load the census dataset; head() renders the first 5 rows in a notebook cell.
data = pd.read_csv('census.csv')
data.head()
This preliminary step is important for gaining insight into what is happening in the data and for uncovering patterns.
# Column dtypes and non-null counts.
data.info()
# Missing-value count per column.
data.isnull().sum()
#To determine the number of people earning >50K and <= 50K
# normalize=True returns fractions; *100 converts them to percentages.
data['income'].value_counts(normalize=True)*100
From the result above, we see that most people in the dataset earn less than 50K. The target variable indicates whether a person earns more than 50K or not, so we transform the column into numerical data that can be fed into our model.
# Encode the target: 1 for '>50K', 0 otherwise (binary classification target).
data['income'] = data['income'].apply(lambda x: 1 if x == '>50K' else 0)
data.head()
data.info()
data['income'].value_counts()
# Correlation of each numeric feature with the target.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns in corr();
# older pandas silently dropped them, so this preserves the intended output.
data.corr(numeric_only=True)['income']
# Category shares (%) of workclass before bucketing.
data['workclass'].value_counts(normalize=True)*100
plt.figure(figsize=(7, 5))
sns.countplot(x='workclass', data=data)
plt.tight_layout()
# Collapse workclass into 3 coarse buckets: Private -> Class_A,
# Self-emp-not-inc -> Class_B, everything else -> Class_C.
# NOTE: category labels carry a leading space in this CSV.
work_class = {' Private': 'Class_A',
              ' Self-emp-not-inc': 'Class_B',
              ' State-gov': 'Class_C', ' Federal-gov': 'Class_C',
              ' Self-emp-inc': 'Class_C', ' Without-pay': 'Class_C',
              ' Local-gov': 'Class_C'}
data['workclass'] = data['workclass'].replace(work_class)
data['workclass'].value_counts()
plt.figure(figsize=(7, 5))
sns.countplot(x='workclass', data=data)
plt.tight_layout()
From the displayed result, we can see that the private working class dominates our dataset.
# Category shares (%) of education_level.
data['education_level'].value_counts(normalize=True)*100
plt.figure(figsize=(14,4))
sns.countplot(x = 'education_level', data= data)
plt.tight_layout()
From the plot above, we see that high-school graduates dominate the observations in our dataset.
# Re-check education_level shares (%) before collapsing the categories.
data['education_level'].value_counts(normalize=True)*100
def education_level(level):
    """Bucket a raw education label into one of four coarse levels.

    ' HS-grad' -> 'Level_1', ' Some-college' -> 'Level_2',
    ' Bachelors' -> 'Level_3', anything else -> 'Level_4'.
    (Labels carry a leading space in this dataset.)
    """
    buckets = {
        ' HS-grad': 'Level_1',
        ' Some-college': 'Level_2',
        ' Bachelors': 'Level_3',
    }
    return buckets.get(level, 'Level_4')
# Replace the raw labels with the 4 buckets produced by education_level().
data['education_level'] = data['education_level'].apply(lambda x: education_level(x))
data['education_level'].value_counts()
plt.figure(figsize=(10,4))
sns.countplot(x = 'education_level', data= data)
plt.tight_layout()
# Summary statistics for the categorical columns (count/unique/top/freq).
# The builtin `object` replaces np.object, which was removed in NumPy 1.24.
data.describe(include=[object, pd.Categorical]).T
# Category shares (%) of marital-status before bucketing.
data['marital-status'].value_counts(normalize=True)*100
plt.figure(figsize=(10, 4))
sns.countplot(x='marital-status', data=data)
plt.tight_layout()
def marital_status(status):
    """Collapse a raw marital-status label into one of four coarse codes.

    ' Married-civ-spouse' -> 'Status_1', ' Never-married' -> 'Status_2',
    ' Divorced' -> 'Status_3', anything else -> 'Status_4'.
    (Labels carry a leading space in this dataset.)
    """
    codes = {
        ' Married-civ-spouse': 'Status_1',
        ' Never-married': 'Status_2',
        ' Divorced': 'Status_3',
    }
    return codes.get(status, 'Status_4')
# Replace the raw labels with the 4 codes produced by marital_status().
data['marital-status'] = data['marital-status'].apply(lambda x: marital_status(x))
data['marital-status'].value_counts()
plt.figure(figsize=(10,4))
sns.countplot(x = 'marital-status', data= data)
plt.tight_layout()
# Raw category counts for occupation and relationship.
data['occupation'].value_counts()
data['relationship'].value_counts()
plt.figure(figsize=(8,5))
sns.countplot(x = 'relationship', data= data)
plt.tight_layout()
data.info()
data['sex'].value_counts()
data['race'].value_counts()
plt.figure(figsize=(8,5))
sns.countplot(x = 'race', data= data)
plt.tight_layout()
# Percentage of rows whose race is ' White' (note the leading space in the label).
sum(data['race']==' White')/len(data) * 100
From the result above, we see that White dominates the dataset at about 86%. Based on this result, we can create another feature known as Is_White.
# Binary feature: 'Yes' if race is ' White', else 'No' (leading space in label).
data['Is_White']= data['race'].apply(lambda x: 'Yes' if x==' White' else 'No')
plt.figure(figsize=(8,5))
sns.countplot(x = 'Is_White', data= data)
plt.tight_layout()
plt.figure(figsize=(8,5))
sns.countplot(x = 'sex', data= data)
plt.tight_layout()
list(data.columns)
# Drop 'race' now that Is_White replaces it.
data.drop(['race'],axis=1,inplace=True)
plt.figure(figsize=(20,5))
sns.countplot(x = 'native-country', data= data)
plt.tight_layout()
From the result obtained above, we see that the dataset is dominated by the United-States native-country. Similarly, we can create another feature known as Is_USA to determine whether the person is from the USA or not.
data['native-country'].value_counts().head()
# Binary feature: 'Yes' if native-country is ' United-States' (leading space in label).
data['Is_USA']= data['native-country'].apply(lambda x: 'Yes' if x==' United-States' else 'No')
plt.figure(figsize=(8,5))
sns.countplot(x = 'Is_USA', data= data)
plt.tight_layout()
data.head()
#Plotting the scatter plot of all the features
# One full pairplot per categorical column, colored by that column's categories.
# NOTE(review): this redraws the entire pair grid once per hue column -- slow
# on a frame of this size; confirm it is worth the runtime.
cat_col = ['workclass', 'education_level', 'marital-status', 'occupation',
           'relationship', 'sex', 'native-country', 'Is_White']
for col in cat_col:
    sns.set()
    cols = list(data.columns)
    plt.figure()
    # height= replaces the old `size=` keyword, which seaborn renamed in 0.9
    # and later removed.
    sns.pairplot(data[cols], height=3, hue=col)
    plt.show()
data.info()
# Histograms of the four numeric columns, stacked in one figure.
_, axes = plt.subplots(nrows=4, ncols=1, figsize=(10, 6))
x1 = data['capital-gain']
x2= data['capital-loss']
x3 = data['hours-per-week']
x4 = data['income']
axes[0].hist(x1,bins=25)
axes[0].set(xlabel='capital-gain')
axes[1].hist(x2,bins=25)
axes[1].set(xlabel='capital-loss')
axes[2].hist(x3,bins=25)
axes[2].set(xlabel='hours-per-week')
axes[3].hist(x4,bins=25)
axes[3].set(xlabel='income')
plt.tight_layout()
# Log-transform the heavily skewed monetary columns in place; log(x+1) keeps zeros at 0.
data[['capital-gain','capital-loss']] = data[['capital-gain','capital-loss']].apply(lambda x: np.log(x+1))
data.head()
# NOTE(review): the column was already log-transformed above, so these plots
# show transformed values rather than the raw distribution -- confirm intent.
data['capital-gain'].plot.hist()
data['capital-gain'].plot.hist(bins=25)
plt.ylim((0, 2000))
plt.yticks([0, 500, 1000, 1500, 2000],[0, 500, 1000, 1500, ">2000"])
plt.tight_layout()
From the first plot of capital-gain, we see that little information is visible because the vast majority of values are below 1000. Hence, in order to capture the remaining information, I scaled the y-axis, which reveals the positively skewed distribution in the second chart. A skewed distribution will affect our model, so it is important to transform it, as shown below, using a logarithmic transformation.
# NOTE(review): 'capital-gain' was already log-transformed in place earlier,
# so this applies log a second time -- this likely should run on the raw column.
(np.log(data['capital-gain'] + 1)).plot.hist(bins=25)
plt.ylim((0, 2000))
plt.yticks([0, 500, 1000, 1500, 2000],[0, 500, 1000, 1500, ">2000"])
plt.tight_layout()
# Same before/after view for capital-loss.
data['capital-loss'].plot.hist()
data['capital-loss'].plot.hist(bins=25)
plt.ylim((0, 2000))
plt.yticks([0, 500, 1000, 1500, 2000],[0, 500, 1000, 1500, ">2000"])
plt.tight_layout()
# NOTE(review): 'capital-loss' was already log-transformed in place earlier,
# so this applies log a second time -- this likely should run on the raw column.
(np.log(data['capital-loss'] + 1)).plot.hist(bins=25)
plt.ylim((0, 2000))
plt.yticks([0, 500, 1000, 1500, 2000],[0, 500, 1000, 1500, ">2000"])
plt.tight_layout()
data.head()
# One-hot encode all remaining categorical columns.
# drop_first=True drops one dummy per column to avoid perfect collinearity.
cat = ['workclass','education_level','marital-status','occupation','relationship',
'sex','native-country','Is_White','Is_USA']
data = pd.get_dummies(data,columns=cat,drop_first=True)
data.head()
data.shape
This stage involves applying the necessary algorithms for our model to learn from the data and evaluating their performance. The process includes:
from sklearn.model_selection import train_test_split
# Features/target split; 70/30 train/test with a fixed seed for reproducibility.
X = data.drop(['income'],axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
from sklearn.metrics import accuracy_score, make_scorer,confusion_matrix,classification_report,fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Cross-validation configuration shared by the grid searches below.
num_folds = 10
seed = 8
def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search `params` over `model` with 10-fold CV and print a summary.

    Prints the best CV accuracy, the best parameter combination, the mean
    fit/score times across candidates, and the refit best estimator.
    Returns None.
    """
    # shuffle=True is required when random_state is set: scikit-learn >= 0.24
    # raises a ValueError for KFold(random_state=...) without shuffling.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    # error_score=0. scores a failing parameter combination as 0 instead of
    # aborting the whole search.
    grid = GridSearchCV(estimator=model, param_grid=params, error_score=0., cv=kfold)
    grid.fit(X, y)  # fit the model and parameters
    # our classical metric for performance
    print("Best Accuracy: {}".format(grid.best_score_))
    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
    print('Best parameter.{}'.format(grid.best_estimator_))
# Import four machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Set up some parameters for our grid search
# We will start with four different machine learning models
# logistic regression, KNN, Decision Tree, and Random Forest
# solver='liblinear' supports both l1 and l2 penalties; the default lbfgs
# solver rejects l1, which error_score=0. would otherwise silently score as 0.
lr_params = {'C': [1e-1, 1e0, 1e1, 1e2], 'penalty': ['l1', 'l2'],
             'solver': ['liblinear']}
tree_params = {'max_depth': [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
forest_params = {'n_estimators': [10, 20, 30, 40, 50, 70, 90, 100],
                 'max_depth': [None, 1, 3, 5, 7, 8, 9]}
knn_params = {'n_neighbors': [1, 3, 5, 7]}
# instantiate the four machine learning models
lr = LogisticRegression()
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
# Search on the TRAINING split only: the original searched on the full
# dataset (X, y), leaking the held-out test rows into model selection.
get_best_model_and_accuracy(lr, lr_params, X_train, y_train)
get_best_model_and_accuracy(d_tree, tree_params, X_train, y_train)
get_best_model_and_accuracy(forest, forest_params, X_train, y_train)
# Final model: the random-forest settings picked by the grid search above.
# Only the non-default choices are spelled out; the original call also passed
# max_features='auto' (removed in scikit-learn 1.3; 'sqrt' reproduces the old
# classifier behavior) and min_impurity_split (removed in scikit-learn 1.0).
model = RandomForestClassifier(n_estimators=30, max_depth=9, max_features='sqrt')
model.fit(X_train, y_train)
prediction = model.predict(X_test)
print('Accuracy: ' + str(accuracy_score(y_test, prediction)))
print()
print('Confusion matrix: ' + str(confusion_matrix(y_test, prediction)))
print()
print("Classification Report " + str(classification_report(y_test, prediction)))
# beta=0.5 weights precision more heavily than recall.
print("F_beta score : " + str(fbeta_score(y_test, prediction, beta=0.5)))
# Extract the fitted forest's feature importances.
importances = model.feature_importances_
# Plot the top features with the project-supplied visuals helper.
vs.feature_plot(importances, X_train, y_train)